package paper.pdf;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.Iterator;
import java.util.Locale;
import java.util.regex.*;

import paper.parse.*;
import paper.pdf.TextExtractor;
import paper.token.BasicToken;
import paper.token.DocumentHead;
import paper.token.TT;
import paper.token.Token;

public class ExtractGroundTruth extends TextExtractor {
    private boolean inTitle = true;
    private String titleBlock = "";
    String gtName;

    void listNone() throws FileNotFoundException {
    }
    
    void listFirst() throws FileNotFoundException {
    }

    void listLast() {
    }

    void startFile(String pdfFile, String pdfAbs) throws FileNotFoundException {
        gtName = pdfFile.replace(".pdf",".gt");
        inTitle = true;
        titleBlock = "";
    }

    void doneFile() {
    }

    void closeFile() {
    }

    void errorOut(String s) {
        System.err.println(s);
    }

    public void process()  {
        int i ;
        Pattern ref = Pattern.compile("(^ *abstract(\\.)? *$)|(^ *abstract[^:] .*)|(^(1.? +)? *introduction(\\.)? *$)");
        while ( (i = lines.indexOf("\n")) != -1) {
            String first = lines.substring(0,i);
            if (ref.matcher(first.toLowerCase()).matches()) {
                inTitle = false;
            }
            if (titleBlock.length() < 1600 && inTitle) {
                titleBlock += " \n" + first;
            }
            lines = lines.substring(i+1);
        }
    }

    public void extract() {
        PrintWriter pw = null;
        try {
            pw = new PrintWriter("groundTruths/" + gtName);
        } catch(Exception e) {
            System.out.println("Huh " + e);
        }
        DocumentHead dh = new DocumentHead(titleBlock);
        BasicToken.distill(dh);
        BasicToken.tokenise(dh);

        Iterator<Token> i = dh.iterator();
        while (i.hasNext()) {
            Token x = i.next();
            pw.print("|" + x.text);
        }
        pw.flush();
        pw.close();
    }

    public static void main( String[] args ) throws Exception {
        new ExtractGroundTruth().runMain(args);
    }

}


